# source("baflights19.R")
library(tidyverse)
library(data.table)
library(purrr)
library(arsenal)
library(labelled)
library(gtsummary)
library(tibble)
library(sjlabelled)
library(expss)
library(knitr)
library(pacman)
p_load(tidyverse, nycflights13, tictoc)

The baflights19.R saves each tibble as an .Rds file within the data subdirectory. To load the tibbles, the read_rds function from the readr R package is employed.

airlines_new <- read_rds("C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/airlines.Rds")
airports_new <- read_rds("C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/airports.Rds")
flights_new <- read_rds("C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/flights.Rds")
planes_new <- read_rds("C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/planes.Rds")
weather_new <- read_rds("C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/weather.Rds")

nycflights13

flights

baflights19

flights_new
fs::dir_ls("C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data")
## C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/airlines.Rds
## C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/airports.Rds
## C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/flights.Rds
## C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/planes.Rds
## C:/Users/bichl/Documents/CSUEB/STAT 650/Project STAT 650/baflights19/data/weather.Rds

Part I.

  1. Upon downloading the data, the initial focus will be on processing the first month's data. Subsequently, the final phase will involve incorporating all available data and conducting a comprehensive analysis for the entirety of 2020. The dataset encompasses all departing flights originating from the Bay Area, encompassing departures from San Francisco (SFO), Oakland (OAK), and San Jose (SJC) airports. The analysis seeks to determine the total count of departing flights in January 2020 and the distribution of departures across each airport during that month.
dep_flights <- flights_new %>% filter(!is.na(arr_delay)) %>% select(month, origin) %>% filter(month==1) %>% mutate(n= map_int(month,length)) %>% drop_na()

dep_flights
dim(dep_flights)
## [1] 22039     3

A total of 22,039 flights departed in January.

dep_flights %>% filter(origin=="SFO") %>% dim()
## [1] 13078     3

In January, San Francisco Airport recorded 13,078 flights.

dep_flights %>% filter(origin=="SJC") %>% dim()
## [1] 4841    3

In January, San Jose Airport registered 4,841 flights.

dep_flights %>% filter(origin=="OAK") %>% dim()
## [1] 4120    3

In January, Oakland Airport logged 4,120 flights.

  1. Examine and compare the variables present in the baflights20 and nycflights13 data frames. Construct a table showcasing the variables shared between both datasets, accompanied by brief descriptions for each variable, akin to an abbreviated codebook. Differences in variables between the two datasets should also be documented.
comparedf(flights,flights_new)
## Compare Object
## 
## Function Call: 
## comparedf(x = flights, y = flights_new)
## 
## Shared: 19 non-by variables and 286933 observations.
## Not shared: 0 variables and 49843 observations.
## 
## Differences found in 19/19 variables compared.
## 1 variables compared have non-identical attributes.
#summary(comparedf(flights,flights_new))

``

data(flights)
flights = apply_labels(flights,
                      year = "Year of departure.",
                      month = "Month of departure.",
                      day = "Day of departure.)",
                      dep_time ="Actual departure (format HHMM or HMM), local tz.",
                      sched_dep_time = "Scheduled departure (format HHMM or HMM), local tz.",
                      dep_delay = "Departure delays, in minutes. Negative times represent early departures.",
                      arr_time = "Actual arrival times (format HHMM or HMM), local tz.",
                      sched_arr_time = "Scheduled arrival times (format HHMM or HMM), local tz.",
                      arr_delay = "Arrival delays, in minutes. Negative times represent early departures/arrivals.",
                      carrier = "Two letter carrier abbreviation. See airlines to get name.",
                      flight = "Flight number.",
                      tailnum = "Plane tail number. See planes for additional metadata.",
                      origin = "Origin. See airports for additional metadata.",
                      dest = "destination. See airports for additional metadata.",
                      air_time = "Amount of time spent in the air, in minutes.",
                      distance = "Distance between airports, in miles.",
                      hour = "Time of scheduled departure broken into hour.",
                      minute = "Time of scheduled departure broken into minutes.",
                      time_hour = "Scheduled date and hour of the flight as a POSIXct date. Along with origin, can be used to join flights data to weather data."
)
codebook <- enframe(get_label(flights))

kable(codebook)
name value
year Year of departure.
month Month of departure.
day Day of departure.)
dep_time Actual departure (format HHMM or HMM), local tz.
sched_dep_time Scheduled departure (format HHMM or HMM), local tz.
dep_delay Departure delays, in minutes. Negative times represent early departures.
arr_time Actual arrival times (format HHMM or HMM), local tz.
sched_arr_time Scheduled arrival times (format HHMM or HMM), local tz.
arr_delay Arrival delays, in minutes. Negative times represent early departures/arrivals.
carrier Two letter carrier abbreviation. See airlines to get name.
flight Flight number.
tailnum Plane tail number. See planes for additional metadata.
origin Origin. See airports for additional metadata.
dest destination. See airports for additional metadata.
air_time Amount of time spent in the air, in minutes.
distance Distance between airports, in miles.
hour Time of scheduled departure broken into hour.
minute Time of scheduled departure broken into minutes.
time_hour Scheduled date and hour of the flight as a POSIXct date. Along with origin, can be used to join flights data to weather data.

3. Identify the month with the highest proportion of cancelled flights and the month with the lowest proportion.

flights2 <- flights_new %>%
group_by(month) %>%
summarize(cancelled = sum(is.na(arr_delay)),total = n(), prop_cancelled = cancelled/total)

flights2
ggplot(data=flights2,aes(x=month,y=prop_cancelled)) + geom_point() + labs(title = " Proportion of Cancelled Flights Each Month", y = " proportion cancelled")

February exhibited the highest proportion of cancelled flights, whereas October had the lowest. Notably, there appears to be a notable spike in cancellation proportions during the winter and spring months (January, February, March, April) as well as during the summer and autumn months (June, July, October). These patterns suggest that specific circumstances during these seasons likely influence flight cancellations, potentially attributable to severe weather conditions.

4. Identify the aircraft (specified by the tailnum variable) that made the most trips from Bay Area airports in 2019. Additionally, create a plot illustrating the number of trips per week throughout the year.

flights_new %>%
  filter(!is.na(tailnum)) %>%
  group_by(tailnum) %>%
  summarise(count = n()) %>% arrange(desc(count))%>%
  head()

The aircraft N521VA had the highest number of trips from Bay Area airports in 2019.

flights2 <- flights_new %>%
  filter(tailnum == "N521VA") %>%
  mutate(week=week(time_hour)) %>%
  group_by(week) %>%
  summarise(count=n())

ggplot(data=flights2, aes(x=week,y=count)) +geom_point() + geom_line() + labs(title="Number of Trip from Bay Area for Plane N521VA Each Week")

5. Determine the oldest aircraft (identified by thetailnum variable) that departed from Bay Area airports in 2019. Additionally, ascertain the total count of aircraft that flew from Bay Area airports and are included in the planes table.

planes2 <- select(planes_new,tailnum,year, manufacturer)
flights2 <- select(flights_new,tailnum)
Bay_flights <- left_join(planes2,flights2) %>%
arrange(year) %>%
  unique() %>% arrange(year)
## Joining, by = "tailnum"
head(Bay_flights)

The aircraft with the tail number N990JB, manufactured in 1977, is the oldest among those that flew from Bay Area airports in 2019.

Bay_flights2 <- distinct(Bay_flights)
nrow(Bay_flights2)
## [1] 4031

There are a total of 4031 distinct airplanes included in the dataset.

Determine the count of airplanes for which the date of manufacture is missing.

Bay_flights2 <- Bay_flights %>%
  filter(is.na(year)) %>%
  distinct(tailnum)
nrow(Bay_flights2)
## [1] 79

A total of 79 airplanes lack information regarding their date of manufacture.

6. Identify the top five manufacturers with the highest frequency of aircraft. Additionally, analyze whether there have been changes in the distribution of manufacturers over time, as evidenced by the airplanes flying from the Bay Area in 2019.

manufacturers <- Bay_flights %>%
  select(manufacturer,tailnum,year) %>%
  unique() %>%
  group_by() %>%
  group_by(manufacturer) %>% 
  summarise(count=n()) %>%
  arrange(desc(count))

manufacturers

The top five manufacturers by frequency are Boeing, Airbus, Bombardier Inc, Embraer S A, C Series Aircradt LTD PTNRSP, Cessna

manufacturers <- Bay_flights %>%
  mutate(manufacturer = 
           case_when(
             manufacturer=="AIRBUS INDUSTRIE" ~"AIRBUS", 
             manufacturer=="AIRBUS CANADA LTD PTNRSP"~"AIRBUS",
             manufacturer=="AIRBUS SAS" ~ "AIRBUS",
             manufacturer=="EMBRAER S A"~"EMBRAER",TRUE~manufacturer)) %>%
  select(manufacturer,tailnum,year) %>%
  unique() %>%
  group_by(manufacturer)%>%
  summarise(count=n())%>%
  arrange(desc(count))

manufacturers
Bay_flights2 <- left_join(manufacturers,Bay_flights) %>%
  mutate(manu = ifelse(count >100, manufacturer,"Other"))
## Joining, by = "manufacturer"
Bay_flights2 %>%
  group_by(manu)%>%
  summarize(avgyear = mean( year, na.rm = TRUE)) %>%
  arrange(desc(avgyear))

In historical usage, the predominant aircraft manufacturers have been Boeing and Bombardier, with Airbus and Embraer specializing in newer aircraft models.

PART II:

1. The inquiry concerns the temperature distribution in July 2013. It aims to identify significant outliers in terms of wind speed. Additionally, it seeks to analyze the relationship between dew and humidity, as well as the relationship between precipitation and visibility.

head(weather_new)
weather_new %>%
      filter(month == 7) %>%
      ggplot(aes(x = temp)) +
      geom_histogram() +
      labs(x = "Temperature", y = "Counts", title = "Distribution of temperature in July, 2019")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2230 rows containing non-finite values (stat_bin).

# Calculate the quartile

qua_windspeed <- quantile(weather_new$wind_speed, na.rm = TRUE)

qua_windspeed
##       0%      25%      50%      75%     100% 
##  0.00000  4.60312  8.05546 12.65858 41.42808
#Calculate the interquartile range

    IQR <- as.numeric(qua_windspeed[4]) - as.numeric(qua_windspeed[2])

IQR
## [1] 8.05546
    weather_new %>%
      filter((wind_speed >= as.numeric(qua_windspeed[4]) + 3*IQR) | (wind_speed <= as.numeric(qua_windspeed[2]) - 3*IQR)) %>%
      arrange(desc(wind_speed))
boxplot(weather_new$wind_speed)

2. The investigation involves determining the number of days with precipitation in the Bay Area during 2019. Additionally, it examines potential disparities in mean visibility (visib) concerning both the day of the week and the month of the year.

weather_new %>%
 
  select(day,precip,origin)%>%
  filter(precip == 0 & origin == c("SFO","OAK","SJC")) %>%
  summarise( n=n(),sum(n))
## Warning in origin == c("SFO", "OAK", "SJC"): longer object length is not a
## multiple of shorter object length

The count of rainless days totals 48. There were 317 days with precipitation in the Bay Area in 2019.

Avg_vib_day <- weather_new %>% 
  
   mutate(Dayofwk = wday(time_hour)) %>%
   select(Dayofwk, visib ) %>% 
   filter(!is.na(visib)) %>%
   group_by(Dayofwk) %>%
   summarise(Avg_day = mean(visib))

Avg_vib_day
Avg_vib_mth <- weather_new %>% 
  
   select(month, visib ) %>% 
   filter(!is.na(visib)) %>%
   group_by(month) %>%
   summarise(Avg_mth = mean(visib))

Avg_vib_mth

Mean visibility varied notably based on the day of the week and month of the year, particularly during November and December.